This TB hackday script uses (pre-processed) RNA sequencing data from the following study to identify granuloma-associated T cell genes:
Then the expression of these genes can be further examined and validated using pseudo-bulk transcriptomic data from scRNAseq data in this study:
Foreman et al. compared the gene expression of T cells isolated from PBMC vs. T cells from granulomas (n=4 NHP, n=23 granulomas). They identified genes that were associated with granuloma T cells, and specifically identied genes that were correlated with Mtb burden in the granuloma (CFU).
Foreman et al. 2023 Granuloma-associated T cell genes
Load relevant packages. Change <data_dir> variable
as appropriate.
Load the pre-processed RNA sequencing data. There are two important files:
foreman_et_al_counts.csv contains log2-transformed
counts per million (log2-CPM), that were computed from raw counts by the
study authors. The table contains 84 columns, with one column
gene_id and the remaining columns matching
sampleids in the metadata. There are 30,689 genes in the
dataset.
foreman_etal_meta.csv contains all the 83 sample-
and granuloma-level metadata that is available for these samples and
animals. There are 27 CD8+ T cell samples sorted from granulomas and 31
CD4+ T cell samples sorted from granulomas. Other variables include
subject, sort, condition,
sex, and granuloma CFU.
library(readr)
library(tidyverse)
library(edgeR)
library(kimma)
library(ggplot2)
library(dplyr)
library(ggrepel)
# NOTE --- REPLACE the <data_dir> FOLDER DESTINTATION AS APPROPRIATE
data_dir <- '/fh/fast/gilbert_p/fg_data/SEATRAC/TB_hackday_2024/processed_data'
# These are already log2-CPM
ncts <- readr::read_csv(file.path(data_dir, "foreman_etal_counts.csv"))
meta <- readr::read_csv(file.path(data_dir, "foreman_etal_meta.csv"))
The sampleid columns of the ncts variable
and the rows of sampleid in the meta variable
match. For this first analysis we will focus on the CD8 T cells sorted
from PBMC or granulomas, creating subset tables indicated by
_ss variables. Then we initialize the DGEList
object and create a limma voom model with a
design matrix to identify genes that are associated with granuloma Mtb
burden (CFU).
In the accompanying “mean-variace” plot the x-axis represents the
average expression levels of genes across all samples. The y-axis
represents the square-root of the variance (i.e., standard deviation) of
gene expression levels. It shows how the variance changes with respect
to the mean expression. Every dot is a gene and the trend line shows the
relationship between the mean and the variance. Note that the variance
is relatively stable across expression levels and the relationship is
smooth; this is good for analysis and voom will use this
relationship to adjust the model fits of each gene. If you re-run the
code block without the filtering you will see the impact on the
mean-variance plot.
meta_ss = meta %>% filter(condition == "CD8_gran")
keep_ids = meta_ss %>% pull(sampleid)
keep_ids = c('gene_id', keep_ids)
ncts_ss = ncts %>% select(any_of(keep_ids))
# Discard genes that have low counts/prevalence
filter = rowSums(ncts_ss > 1) >= (0.5 * ncol(ncts_ss))
ncts_ss = ncts_ss[filter, ]
# Create the object for differential expression testing
dge_o = DGEList(counts=ncts_ss,
genes=ncts_ss[, 1],
samples=meta_ss,
group=meta_ss[['CFU']])
## Setting first column of `counts` as gene annotation.
# Compute weights
# ncts_ss <- calcNormFactors(dge_o, method = "TMM")
# Specify the model/design matrix
design_temp = model.matrix(~CFU, data=meta_ss)
# Create the voom object and fit the model
v <- voom(dge_o, design=design_temp, plot=TRUE)
# vvwts <- voomWithQualityWeights(dge_o, design=design_temp, normalize.method="none", plot=TRUE)
fit = lmFit(v, design_temp)
# Estimate contrasts and p-values
fit = eBayes(fit, robust=TRUE)
summary(decideTests(fit, adjust.method="fdr", p.value = 0.05))
## (Intercept) CFU
## Down 0 0
## NotSig 0 11372
## Up 11372 0
results <- topTable(fit, adjust="BH", coef="CFU", p.value=1, number=Inf, resort.by="P")
head(results %>% select(gene_id, logFC, AveExpr, P.Value, adj.P.Val), 20)
## gene_id logFC AveExpr P.Value adj.P.Val
## 7215 OSGEPL1 -7.551531e-05 6.045068 2.905859e-05 0.1676882
## 1970 CYHYorf15B -9.964047e-05 5.653331 2.949141e-05 0.1676882
## 9554 STX3 -8.007373e-05 5.338798 9.319884e-05 0.3532857
## 11186 ZNF500 -9.459217e-05 5.294934 1.536120e-04 0.4367190
## 1285 CCL1 -1.038526e-04 5.823269 1.956488e-04 0.4449837
## 4293 LOC100424917 -8.809948e-05 5.679183 3.059415e-04 0.4999033
## 10788 WDSUB1 -8.347674e-05 5.396688 3.077139e-04 0.4999033
## 3958 KDELC2 -8.726556e-05 5.084265 4.218371e-04 0.5668298
## 1840 CRIP2 -8.841231e-05 5.173913 4.735072e-04 0.5668298
## 2529 ENPP5 -8.621299e-05 5.319643 5.449366e-04 0.5668298
## 4284 LOC100424303 -8.519118e-05 5.508063 5.482877e-04 0.5668298
## 3181 GMPR -1.036991e-04 5.419739 7.558196e-04 0.6929904
## 7871 PRDM10 -5.962352e-05 5.427453 8.439460e-04 0.6929904
## 9732 TBXA2R -7.348790e-05 5.632555 8.660223e-04 0.6929904
## 539 ARMT1 -8.102035e-05 5.841423 9.140745e-04 0.6929904
## 9328 SORBS1 -7.333526e-05 4.905782 1.047444e-03 0.7444706
## 9477 ST3GAL5 -8.008782e-05 5.740576 1.213585e-03 0.7723764
## 4286 LOC100424418 -8.122010e-05 4.955814 1.349849e-03 0.7723764
## 8622 RPS6KA4 -6.413657e-05 5.826126 1.405609e-03 0.7723764
## 3797 INVS -5.160037e-05 5.742617 1.467927e-03 0.7723764
# Add a column for significance based on FDR
results <- results %>%
mutate(Significance = ifelse(adj.P.Val < 0.2, "Significant", "Not Significant"))
# Select the top 10 genes based on adjusted p-value for labeling
top_genes <- results %>%
arrange(adj.P.Val) %>%
slice_head(n = 10)
max_logFC <- max(abs(results$logFC), na.rm = TRUE)
# Create the volcano plot
volcano_plot <- ggplot(results, aes(x = logFC, y = -log10(P.Value))) +
geom_point(aes(color = Significance), alpha = 0.6) +
scale_color_manual(values = c("Significant" = "red", "Not Significant" = "grey")) +
geom_text_repel(data = top_genes,
aes(label = gene_id),
max.overlaps = 10,
box.padding = 0.3,
point.padding = 0.3,
segment.color = "grey50",
size = 3) +
xlim(c(-max_logFC, max_logFC)) +
theme_minimal() +
labs(
x = "log2 Fold-change",
y = "-log10 P-value",
color = "FDR < 0.05") +
theme(plot.title = element_text(hjust = 0.5))
volcano_plot
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Redo the analysis using a mixed-effects model to account for the longitudinal design
dge_o = DGEList(counts=ncts_ss,
genes=ncts_ss[, 1],
samples=meta_ss,
group=meta_ss[['CFU']])
## Setting first column of `counts` as gene annotation.
ncts_ss <- calcNormFactors(dge_o, method = "TMM")
design_temp=model.matrix(~CFU, data=meta_ss)
v <- voom(dge_o, design=design_temp, plot=FALSE)
# Can't figure out why I get this error here
# lme/lmerel model: expression~protect_outcome+sex+visit+(1|animalid)
# Error in `[.data.frame`(weights.format, order(rownames(weights.format)), :
# undefined columns selected
#klm <- kmFit(dat = v,
# model = "~CFU + (1|subject)",
# run_lme = TRUE,
# libraryID="sampleid",
# patientID="subject",
# use_weights = TRUE,
# metrics = TRUE,
# run_contrast = FALSE,
# processors=1)
#summarise_kmFit(fdr = klm$lm)
#plot_volcano(model_result = klm,
# model = "lme", variables = "CFU",
# y_cutoff = 0.05)